/* TextPDF - generate PDF dynamically
 * 
 * Copyright (c) 2015 Lucky Byte, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
package com.lucky_byte.pdf;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.util.regex.Pattern;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;

/**
 * Reads a Word .doc file and converts it into the XML template format
 * understood by TextPDF.
 *
 * <p>Each paragraph of the document becomes a <code>&lt;para&gt;</code>
 * element. Inside it, every character run becomes one of:
 * <ul>
 *   <li><code>&lt;value&gt;</code> - an underlined run made only of
 *       whitespace (a blank to be filled in),</li>
 *   <li><code>&lt;hspace&gt;</code> - a non-underlined run made only of
 *       whitespace,</li>
 *   <li><code>&lt;span&gt;</code> - any other non-empty text.</li>
 * </ul>
 * All output is encoded as UTF-8.
 */
public class DocReader
{
	/** Character encoding used for every byte written to the output. */
	private static final String ENCODING = "UTF-8";

	/** C0 control characters (U+0000 to U+001F), stripped from run text. */
	private static final Pattern CONTROL_CHARS =
			Pattern.compile("[\\u0000-\\u001f]");

	/** Text made only of whitespace and/or U+3000 (IDEOGRAPHIC SPACE). */
	private static final Pattern BLANK_TEXT =
			Pattern.compile("^[\\s\u3000]+$");

	/** Field text of a hyperlink, which is not emitted to the template. */
	private static final Pattern HYPERLINK_FIELD =
			Pattern.compile(" HYPERLINK .+");

	/** Optional stylesheet referenced from the generated XML; may be null. */
	private URL xsl_url = null;

	/**
	 * Sets the XSL stylesheet to reference from the generated XML.
	 *
	 * @param url stylesheet location; only its path component is written
	 *            to the output. Pass <code>null</code> to omit the
	 *            <code>xml-stylesheet</code> processing instruction.
	 */
	public void setXSLUrl(URL url) {
		this.xsl_url = url;
	}

	/**
	 * Escapes the characters that are markup-significant in XML so the
	 * given text can be placed safely in element content or inside a
	 * double-quoted attribute value.
	 *
	 * @param text raw text, not null
	 * @return the text with <code>&amp;</code>, <code>&lt;</code>,
	 *         <code>&gt;</code> and <code>"</code> replaced by the
	 *         predefined XML entity references
	 */
	private static String escapeXml(String text) {
		StringBuilder escaped = new StringBuilder(text.length() + 16);
		for (int k = 0; k < text.length(); k++) {
			char ch = text.charAt(k);
			switch (ch) {
			case '&':
				escaped.append("&amp;");
				break;
			case '<':
				escaped.append("&lt;");
				break;
			case '>':
				escaped.append("&gt;");
				break;
			case '"':
				escaped.append("&quot;");
				break;
			default:
				escaped.append(ch);
				break;
			}
		}
		return escaped.toString();
	}

	/**
	 * Appends the paragraph-level attributes to an open
	 * <code>&lt;para</code> tag.
	 *
	 * @param builder buffer holding the tag being built
	 * @param para    paragraph whose justification is inspected
	 */
	private void appendParaAttrs(StringBuilder builder,
			Paragraph para) {
		switch (para.getJustification()) {
		case 1:
			builder.append(" align=\"center\"");
			break;
		case 2:
			builder.append(" align=\"right\"");
			break;
		default:
			// Every other justification code relies on the template's
			// default alignment, so no attribute is written.
			// NOTE(review): the original comment labelled code 3 as
			// "left"; confirm against the POI/Word justification codes.
			break;
		}
	}

	/**
	 * Appends the font attributes of a character run to an open tag.
	 *
	 * @param builder buffer holding the tag being built
	 * @param run     character run providing the formatting
	 * @param is_span when true, a <code>font-style</code> attribute
	 *                (bold / italic / underline, comma separated) is
	 *                written in addition to <code>font-size</code>
	 */
	private void appendRunAttrs(StringBuilder builder,
			CharacterRun run, boolean is_span) {
		if (is_span) {
			StringBuilder style = new StringBuilder();

			if (run.isBold()) {
				style.append("bold");
			}
			if (run.isItalic()) {
				if (style.length() > 0) style.append(",");
				style.append("italic");
			}
			// Underline code 1 is presumably "single underline" - confirm
			// against the POI CharacterRun documentation.
			if (run.getUnderlineCode() == 1) {
				if (style.length() > 0) style.append(",");
				style.append("underline");
			}
			if (style.length() > 0) {
				builder.append(" font-style=\"").append(style).append("\"");
			}
		}

		// The reported size is halved before being written; presumably POI
		// reports it in half-points - confirm.
		builder.append(" font-size=\"");
		builder.append((int)(run.getFontSize() / 2));
		builder.append("\"");
	}

	/**
	 * Appends the template element for one character run, or nothing if
	 * the run is a special character, a hyperlink field, or empty after
	 * control characters are removed.
	 *
	 * @param builder    buffer receiving the element
	 * @param run        character run to convert
	 * @param para_index index of the enclosing paragraph, used in the
	 *                   generated value id
	 * @param run_index  index of the run inside its paragraph, used in
	 *                   the generated value id
	 */
	private void appendRun(StringBuilder builder, CharacterRun run,
			int para_index, int run_index) {
		// Skip special characters.
		if (run.isSpecialCharacter()) {
			return;
		}

		String text = CONTROL_CHARS.matcher(run.text()).replaceAll("");

		// Skip hyperlink field text.
		if (HYPERLINK_FIELD.matcher(text).matches()) {
			return;
		}

		boolean blank = BLANK_TEXT.matcher(text).matches();

		if (blank && run.getUnderlineCode() == 1) {
			// An underlined blank is a placeholder the template user fills in.
			builder.append("    <value id=\"");
			builder.append("vid_").append(para_index);
			builder.append("_").append(run_index);
			builder.append("\" minlen=\"");
			builder.append(text.length());
			builder.append("\"");
			appendRunAttrs(builder, run, false);
			builder.append(" />\n");
		} else if (blank) {
			builder.append("    <hspace");
			builder.append(" size=\"");
			builder.append(text.length());
			builder.append("\"");
			appendRunAttrs(builder, run, false);
			builder.append(" />\n");
		} else if (text.length() > 0) {
			builder.append("    <span");
			appendRunAttrs(builder, run, true);
			builder.append(">");
			// Document text may contain '&', '<' or '>', which would
			// otherwise corrupt the generated XML.
			builder.append(escapeXml(text));
			builder.append("</span>\n");
		}
	}

	/**
	 * Converts a .doc document into a TextPDF XML template.
	 *
	 * <p>The output is written paragraph by paragraph. Neither stream is
	 * closed by this method.
	 *
	 * @param doc_stream stream positioned at the start of a .doc file
	 * @param xml_stream destination for the UTF-8 encoded XML template
	 * @throws IOException if the document cannot be read or the output
	 *                     cannot be written
	 */
	public void read(InputStream doc_stream, OutputStream xml_stream)
			throws IOException {
		// Kept as a best-effort no-op: a missing stream is reported on
		// stderr and nothing is written.
		if (doc_stream == null || xml_stream == null) {
			System.err.println("Invalid argument");
			return;
		}

		HWPFDocument document = new HWPFDocument(doc_stream);
		Range range = document.getRange();
		StringBuilder builder = new StringBuilder();

		builder.append("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n");
		if (xsl_url != null) {
			// Escaping also keeps a stray "?>" in the path from ending the
			// processing instruction early.
			builder.append("<?xml-stylesheet type=\"text/xsl\" href=\"");
			builder.append(escapeXml(xsl_url.getPath()));
			builder.append("\"?>\n");
		}
		builder.append("\n<!-- Automatic generated by TextPDF DocReader -->\n");
		builder.append("\n<textpdf>\n");

		xml_stream.write(builder.toString().getBytes(ENCODING));
		builder.setLength(0);

		int para_count = range.numParagraphs();
		for (int i = 0; i < para_count; i++) {
			Paragraph para = range.getParagraph(i);

			if (para.pageBreakBefore()) {	// page break
				builder.append("  <pagebreak />\n");
			}

			builder.append("  <para");
			appendParaAttrs(builder, para);
			builder.append(">\n");

			int run_count = para.numCharacterRuns();
			for (int j = 0; j < run_count; j++) {
				appendRun(builder, para.getCharacterRun(j), i, j);
			}
			builder.append("  </para>\n");

			// Flush after each paragraph so the buffer stays small.
			xml_stream.write(builder.toString().getBytes(ENCODING));
			builder.setLength(0);
		}
		// Use the same explicit encoding as the rest of the output rather
		// than the platform default.
		xml_stream.write("</textpdf>\n".getBytes(ENCODING));
	}

}
